# Importing necessary libraries
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
# Read the CSV file into a pandas DataFrame
A=pd.read_csv("US_Accidents_Dec21_updated.csv")
data.head(20)
| ID | Severity | Start_Time | End_Time | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | Description | ... | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | Sunrise_Sunset | Civil_Twilight | Nautical_Twilight | Astronomical_Twilight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A-1 | 3 | 2016-02-08 00:37:08 | 2016-02-08 06:37:08 | 40.108910 | -83.092860 | 40.112060 | -83.031870 | 3.230 | Between Sawmill Rd/Exit 20 and OH-315/Olentang... | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 1 | A-2 | 2 | 2016-02-08 05:56:20 | 2016-02-08 11:56:20 | 39.865420 | -84.062800 | 39.865010 | -84.048730 | 0.747 | At OH-4/OH-235/Exit 41 - Accident. | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 2 | A-3 | 2 | 2016-02-08 06:15:39 | 2016-02-08 12:15:39 | 39.102660 | -84.524680 | 39.102090 | -84.523960 | 0.055 | At I-71/US-50/Exit 1 - Accident. | ... | False | False | False | False | False | False | Night | Night | Night | Day |
| 3 | A-4 | 2 | 2016-02-08 06:51:45 | 2016-02-08 12:51:45 | 41.062130 | -81.537840 | 41.062170 | -81.535470 | 0.123 | At Dart Ave/Exit 21 - Accident. | ... | False | False | False | False | False | False | Night | Night | Day | Day |
| 4 | A-5 | 3 | 2016-02-08 07:53:43 | 2016-02-08 13:53:43 | 39.172393 | -84.492792 | 39.170476 | -84.501798 | 0.500 | At Mitchell Ave/Exit 6 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 5 | A-6 | 2 | 2016-02-08 08:16:57 | 2016-02-08 14:16:57 | 39.063240 | -84.032430 | 39.067310 | -84.058510 | 1.427 | At Dela Palma Rd - Accident. | ... | False | False | False | False | True | False | Day | Day | Day | Day |
| 6 | A-7 | 2 | 2016-02-08 08:15:41 | 2016-02-08 14:15:41 | 39.775650 | -84.186030 | 39.772750 | -84.188050 | 0.227 | At OH-4/Exit 54 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 7 | A-8 | 2 | 2016-02-08 11:51:46 | 2016-02-08 17:51:46 | 41.375310 | -81.820170 | 41.367860 | -81.821740 | 0.521 | At Bagley Rd/Exit 235 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 8 | A-9 | 2 | 2016-02-08 14:19:57 | 2016-02-08 20:19:57 | 40.702247 | -84.075887 | 40.699110 | -84.084293 | 0.491 | At OH-65/Exit 122 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 9 | A-10 | 2 | 2016-02-08 15:16:43 | 2016-02-08 21:16:43 | 40.109310 | -82.968490 | 40.110780 | -82.984000 | 0.826 | At I-71/Exit 26 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 10 | A-11 | 2 | 2016-02-08 15:43:50 | 2016-02-08 21:43:50 | 39.192880 | -84.477230 | 39.196150 | -84.473350 | 0.307 | At OH-4/Paddock Rd/Exit 9 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 11 | A-12 | 2 | 2016-02-08 16:50:57 | 2016-02-08 22:50:57 | 39.138770 | -84.533940 | 39.139770 | -84.534300 | 0.072 | At US-52/Hopple St/Exit 3 - Accident. | ... | False | False | False | False | True | False | Day | Day | Day | Day |
| 12 | A-13 | 2 | 2016-02-08 17:27:39 | 2016-02-08 23:27:39 | 41.473900 | -81.704233 | 41.473880 | -81.705590 | 0.070 | At US-42/Exit 170 - Accident. | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 13 | A-14 | 2 | 2016-02-08 17:30:18 | 2016-02-08 23:30:18 | 39.582242 | -83.677814 | 39.603013 | -83.637319 | 2.590 | Between OH-72/Exit 58 and US-35/Exit 65 - Acci... | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| 14 | A-15 | 3 | 2016-02-08 18:11:11 | 2016-02-09 00:11:11 | 40.151785 | -81.312635 | 40.151747 | -81.312682 | 0.004 | At Shipley Rd - Accident. | ... | False | False | False | False | False | False | Night | Day | Day | Day |
| 15 | A-16 | 3 | 2016-02-08 18:11:11 | 2016-02-09 00:11:11 | 40.151747 | -81.312682 | 40.151785 | -81.312635 | 0.004 | At Titus Rd - Accident. | ... | False | False | False | False | False | False | Night | Day | Day | Day |
| 16 | A-17 | 3 | 2016-02-08 19:47:42 | 2016-02-09 01:47:42 | 39.972410 | -82.846950 | 39.985290 | -82.856670 | 1.028 | At OH-16/Broad St/Exit 39 - Accident. | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 17 | A-18 | 2 | 2016-02-08 19:47:42 | 2016-02-09 01:47:42 | 39.983800 | -82.856570 | 39.972500 | -82.847460 | 0.918 | At I-270 - Accident. | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 18 | A-19 | 3 | 2016-02-08 20:13:22 | 2016-02-09 02:13:22 | 40.026640 | -82.994400 | 40.012280 | -82.992180 | 0.999 | Between Weber Rd/Exit 113 and Hudson St/Exit 1... | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 19 | A-20 | 4 | 2016-02-08 21:00:17 | 2016-02-09 03:00:17 | 41.679361 | -83.573037 | 41.666124 | -83.566335 | 0.978 | Closed between I-475/Exit 204 and US-24/Detroi... | ... | False | False | False | False | False | False | Night | Night | Night | Night |
20 rows × 47 columns
A.shape
(2845342, 47)
The data has about 2.8 million rows and 47 columns.
A.columns
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Number', 'Street',
'Side', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
'Astronomical_Twilight'],
dtype='object')
A.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2845342 entries, 0 to 2845341 Data columns (total 47 columns): # Column Dtype --- ------ ----- 0 ID object 1 Severity int64 2 Start_Time object 3 End_Time object 4 Start_Lat float64 5 Start_Lng float64 6 End_Lat float64 7 End_Lng float64 8 Distance(mi) float64 9 Description object 10 Number float64 11 Street object 12 Side object 13 City object 14 County object 15 State object 16 Zipcode object 17 Country object 18 Timezone object 19 Airport_Code object 20 Weather_Timestamp object 21 Temperature(F) float64 22 Wind_Chill(F) float64 23 Humidity(%) float64 24 Pressure(in) float64 25 Visibility(mi) float64 26 Wind_Direction object 27 Wind_Speed(mph) float64 28 Precipitation(in) float64 29 Weather_Condition object 30 Amenity bool 31 Bump bool 32 Crossing bool 33 Give_Way bool 34 Junction bool 35 No_Exit bool 36 Railway bool 37 Roundabout bool 38 Station bool 39 Stop bool 40 Traffic_Calming bool 41 Traffic_Signal bool 42 Turning_Loop bool 43 Sunrise_Sunset object 44 Civil_Twilight object 45 Nautical_Twilight object 46 Astronomical_Twilight object dtypes: bool(13), float64(13), int64(1), object(20) memory usage: 773.4+ MB
13 boolean type, 13 float type, 1 integer, and 20 object type of data.
# Get the count of missing values in each column and sort them in descending order
A.isna().sum().sort_values(ascending=False)
Number 1743911 Precipitation(in) 549458 Wind_Chill(F) 469643 Wind_Speed(mph) 157944 Wind_Direction 73775 Humidity(%) 73092 Weather_Condition 70636 Visibility(mi) 70546 Temperature(F) 69274 Pressure(in) 59200 Weather_Timestamp 50736 Airport_Code 9549 Timezone 3659 Nautical_Twilight 2867 Civil_Twilight 2867 Sunrise_Sunset 2867 Astronomical_Twilight 2867 Zipcode 1319 City 137 Street 2 Country 0 Junction 0 Start_Time 0 End_Time 0 Start_Lat 0 Turning_Loop 0 Traffic_Signal 0 Traffic_Calming 0 Stop 0 Station 0 Roundabout 0 Railway 0 No_Exit 0 Crossing 0 Give_Way 0 Bump 0 Amenity 0 Start_Lng 0 End_Lat 0 End_Lng 0 Distance(mi) 0 Description 0 Severity 0 Side 0 County 0 State 0 ID 0 dtype: int64
# Check for duplicates in A
A.duplicated()
0 False
1 False
2 False
3 False
4 False
...
2845337 False
2845338 False
2845339 False
2845340 False
2845341 False
Length: 2845342, dtype: bool
A.corr()
| Severity | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | Number | Temperature(F) | Wind_Chill(F) | Humidity(%) | ... | Give_Way | Junction | No_Exit | Railway | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Severity | 1.000000 | 0.089207 | 0.114063 | 0.089209 | 0.114064 | 0.092140 | -0.000689 | -0.045335 | -0.097459 | 0.037802 | ... | 0.008141 | 0.045928 | -0.002359 | -0.001747 | -0.000773 | -0.018017 | -0.002536 | -0.003009 | -0.012798 | NaN |
| Start_Lat | 0.089207 | 1.000000 | -0.154965 | 0.999995 | -0.154962 | 0.071588 | 0.101281 | -0.475065 | -0.505567 | 0.005876 | ... | 0.007851 | 0.043628 | -0.017348 | 0.003184 | -0.003366 | -0.107697 | 0.010817 | -0.003164 | -0.058208 | NaN |
| Start_Lng | 0.114063 | -0.154965 | 1.000000 | -0.154956 | 0.999999 | 0.039861 | -0.170566 | 0.032159 | 0.010849 | 0.170909 | ... | 0.018110 | -0.017202 | 0.006220 | -0.015450 | 0.000450 | 0.038533 | -0.040265 | -0.007750 | 0.059568 | NaN |
| End_Lat | 0.089209 | 0.999995 | -0.154956 | 1.000000 | -0.154953 | 0.071593 | 0.101270 | -0.475058 | -0.505561 | 0.005869 | ... | 0.007840 | 0.043621 | -0.017347 | 0.003184 | -0.003366 | -0.107696 | 0.010827 | -0.003163 | -0.058210 | NaN |
| End_Lng | 0.114064 | -0.154962 | 0.999999 | -0.154953 | 1.000000 | 0.039830 | -0.170563 | 0.032163 | 0.010852 | 0.170904 | ... | 0.018110 | -0.017199 | 0.006220 | -0.015449 | 0.000450 | 0.038533 | -0.040264 | -0.007750 | 0.059566 | NaN |
| Distance(mi) | 0.092140 | 0.071588 | 0.039861 | 0.071593 | 0.039830 | 1.000000 | 0.067331 | -0.051212 | -0.060094 | 0.026860 | ... | -0.006655 | 0.022442 | -0.010184 | -0.021462 | -0.002491 | -0.052826 | -0.027108 | -0.007322 | -0.105722 | NaN |
| Number | -0.000689 | 0.101281 | -0.170566 | 0.101270 | -0.170563 | 0.067331 | 1.000000 | -0.051146 | -0.049400 | 0.015825 | ... | -0.003792 | -0.010801 | -0.005052 | -0.013668 | -0.001360 | -0.028485 | -0.003688 | -0.005075 | -0.058870 | NaN |
| Temperature(F) | -0.045335 | -0.475065 | 0.032159 | -0.475058 | 0.032163 | -0.051212 | -0.051146 | 1.000000 | 0.993857 | -0.366527 | ... | -0.005524 | -0.020358 | 0.011610 | 0.003108 | 0.002122 | 0.061248 | 0.000021 | 0.005672 | 0.047657 | NaN |
| Wind_Chill(F) | -0.097459 | -0.505567 | 0.010849 | -0.505561 | 0.010852 | -0.060094 | -0.049400 | 0.993857 | 1.000000 | -0.346028 | ... | -0.007584 | -0.048784 | 0.012066 | 0.005062 | 0.002466 | 0.069379 | 0.005424 | 0.007140 | 0.048321 | NaN |
| Humidity(%) | 0.037802 | 0.005876 | 0.170909 | 0.005869 | 0.170904 | 0.026860 | 0.015825 | -0.366527 | -0.346028 | 1.000000 | ... | 0.000063 | 0.006394 | -0.007099 | -0.000309 | 0.000904 | -0.000095 | -0.015631 | -0.005727 | -0.034133 | NaN |
| Pressure(in) | 0.043883 | -0.234889 | 0.208828 | -0.234896 | 0.208830 | -0.069115 | -0.058449 | 0.137756 | 0.134312 | 0.138816 | ... | -0.000727 | 0.051646 | -0.000781 | 0.015718 | 0.000742 | 0.040853 | -0.015793 | 0.000232 | 0.015351 | NaN |
| Visibility(mi) | 0.007371 | -0.086538 | 0.029336 | -0.086532 | 0.029338 | -0.034001 | -0.018511 | 0.211134 | 0.217573 | -0.363305 | ... | 0.002445 | -0.007694 | 0.007327 | 0.002338 | 0.000107 | 0.020449 | 0.002569 | 0.003880 | 0.030796 | NaN |
| Wind_Speed(mph) | 0.048382 | 0.029158 | 0.090564 | 0.029162 | 0.090567 | 0.011127 | 0.013933 | 0.078788 | 0.013011 | -0.174100 | ... | 0.002457 | 0.018484 | 0.002117 | -0.000342 | 0.000217 | 0.016019 | -0.006544 | -0.000406 | 0.016800 | NaN |
| Precipitation(in) | 0.013845 | -0.002835 | 0.024094 | -0.002836 | 0.024094 | 0.003050 | -0.000188 | -0.004425 | -0.008289 | 0.077663 | ... | -0.001313 | 0.014892 | 0.000341 | -0.000027 | -0.000003 | -0.001453 | -0.004493 | -0.001228 | -0.002458 | NaN |
| Amenity | -0.003940 | -0.005879 | 0.014923 | -0.005878 | 0.014924 | -0.032717 | -0.032279 | 0.013420 | 0.016908 | -0.005963 | ... | 0.003297 | -0.026413 | 0.013096 | 0.034372 | 0.000428 | 0.124395 | 0.027137 | 0.010624 | 0.090555 | NaN |
| Bump | -0.003739 | 0.000338 | -0.014867 | 0.000339 | -0.014867 | -0.005408 | -0.003719 | 0.003967 | 0.005370 | -0.007727 | ... | -0.000176 | -0.001792 | 0.002610 | 0.006451 | -0.000125 | 0.006027 | 0.019402 | 0.772162 | -0.003714 | NaN |
| Crossing | -0.035382 | -0.094698 | 0.056169 | -0.094703 | 0.056165 | -0.091256 | -0.054667 | 0.069728 | 0.077446 | -0.030003 | ... | 0.053507 | -0.080214 | 0.042085 | 0.207132 | -0.000764 | 0.144655 | 0.086878 | 0.026087 | 0.422232 | NaN |
| Give_Way | 0.008141 | 0.007851 | 0.018110 | 0.007840 | 0.018110 | -0.006655 | -0.003792 | -0.005524 | -0.007584 | 0.000063 | ... | 1.000000 | -0.007007 | 0.004176 | 0.003981 | 0.002945 | -0.002163 | 0.047691 | 0.000253 | 0.056994 | NaN |
| Junction | 0.045928 | 0.043628 | -0.017202 | 0.043621 | -0.017199 | 0.022442 | -0.010801 | -0.020358 | -0.048784 | 0.006394 | ... | -0.007007 | 1.000000 | -0.003870 | -0.010436 | 0.012083 | -0.044455 | -0.034977 | -0.002262 | -0.096142 | NaN |
| No_Exit | -0.002359 | -0.017348 | 0.006220 | -0.017347 | 0.006220 | -0.010184 | -0.005052 | 0.011610 | 0.012066 | -0.007099 | ... | 0.004176 | -0.003870 | 1.000000 | 0.002736 | -0.000256 | 0.015199 | 0.011802 | 0.001631 | 0.023376 | NaN |
| Railway | -0.001747 | 0.003184 | -0.015450 | 0.003184 | -0.015449 | -0.021462 | -0.013668 | 0.003108 | 0.005062 | -0.000309 | ... | 0.003981 | -0.010436 | 0.002736 | 1.000000 | -0.000589 | 0.109353 | 0.007893 | 0.005064 | 0.053727 | NaN |
| Roundabout | -0.000773 | -0.003366 | 0.000450 | -0.003366 | 0.000450 | -0.002491 | -0.001360 | 0.002122 | 0.002466 | 0.000904 | ... | 0.002945 | 0.012083 | -0.000256 | -0.000589 | 1.000000 | 0.000371 | 0.006412 | 0.002019 | -0.002108 | NaN |
| Station | -0.018017 | -0.107697 | 0.038533 | -0.107696 | 0.038533 | -0.052826 | -0.028485 | 0.061248 | 0.069379 | -0.000095 | ... | -0.002163 | -0.044455 | 0.015199 | 0.109353 | 0.000371 | 1.000000 | 0.027295 | 0.009768 | 0.112593 | NaN |
| Stop | -0.002536 | 0.010817 | -0.040265 | 0.010827 | -0.040264 | -0.027108 | -0.003688 | 0.000021 | 0.005424 | -0.015631 | ... | 0.047691 | -0.034977 | 0.011802 | 0.007893 | 0.006412 | 0.027295 | 1.000000 | 0.017349 | -0.028258 | NaN |
| Traffic_Calming | -0.003009 | -0.003164 | -0.007750 | -0.003163 | -0.007750 | -0.007322 | -0.005075 | 0.005672 | 0.007140 | -0.005727 | ... | 0.000253 | -0.002262 | 0.001631 | 0.005064 | 0.002019 | 0.009768 | 0.017349 | 1.000000 | 0.008646 | NaN |
| Traffic_Signal | -0.012798 | -0.058208 | 0.059568 | -0.058210 | 0.059566 | -0.105722 | -0.058870 | 0.047657 | 0.048321 | -0.034133 | ... | 0.056994 | -0.096142 | 0.023376 | 0.053727 | -0.002108 | 0.112593 | -0.028258 | 0.008646 | 1.000000 | NaN |
| Turning_Loop | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
27 rows × 27 columns
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
sns.heatmap(A.corr(), annot=True,fmt='.2f')
plt.show()
# Remove specified columns from dataframe A and store the result in dataframe B
B= A.drop(labels = ["Start_Lat", "End_Lat", "Start_Lng", "End_Lng", "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop", "Sunrise_Sunset", "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight"], axis = 1, inplace = False)
B.shape
(2845342, 26)
B
Compare severity of accidents & find how many accidents impacted most on traffic?
Severity column shows the severity of the accident, a number between 1 and 4, where 1 indicates the least impact on traffic (i.e., short delay)
# Count the occurrences of each severity level in the 'ID' column and print the result
print(B.groupby('Severity')[['ID']].count())
# Plot a bar chart showing the count of each severity level in the 'ID' column
B.groupby('Severity')[['ID']].count().plot(kind='bar')
ID Severity 1 26053 2 2532991 3 155105 4 131193
Most of the accidents were with severity 2. 131193 accidents had highest severity (severity 4) i.e. their impact on traffic was highest.
print(B.columns)
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Distance(mi)',
'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
'Precipitation(in)', 'Weather_Condition'],
dtype='object')
B.shape
(2845342, 26)
# Create a scatter plot
graph_1 = px.scatter(B, x = "Severity", y = "Visibility(mi)", title = "CORRELATION")
graph_1